we first need to load the required libraries and read our dataset Load necessary library
library(ggplot2)
library(readr)
library(tidymodels)
library(splines)
library(mgcv)
library(caret)
library(earth)
library(pdp)
library(gridExtra)
library(plotly)
library(patchwork)
Next, we need to transform the R datafile into CSV format and read the datafile
load("dat1.RData")
load("dat2.RData")
# Save as CSVs
write.csv(dat1, "dat1.csv", row.names = FALSE)
write.csv(dat2, "dat2.csv", row.names = FALSE)
dat1 = read_csv("dat1.csv", na = c("NA","","."))
dat2 = read_csv("dat2.csv",na = c("NA","","."))
glimpse(dat1)
## Rows: 5,000
## Columns: 14
## $ id <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ age <dbl> 50, 71, 58, 63, 56, 59, 67, 62, 60, 64, 61, 67, 60, 60, 5…
## $ gender <dbl> 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, …
## $ race <dbl> 1, 1, 1, 1, 1, 3, 4, 1, 4, 1, 1, 3, 4, 1, 1, 3, 3, 2, 3, …
## $ smoking <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, …
## $ height <dbl> 176.1, 175.7, 168.7, 167.4, 162.7, 167.8, 175.4, 176.5, 1…
## $ weight <dbl> 68.3, 69.6, 76.9, 90.0, 83.9, 86.8, 91.4, 87.7, 85.7, 76.…
## $ bmi <dbl> 22.0, 22.6, 27.0, 32.1, 31.7, 30.8, 29.7, 28.1, 29.0, 31.…
## $ diabetes <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, …
## $ hypertension <dbl> 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, …
## $ SBP <dbl> 130, 149, 127, 138, 123, 132, 133, 130, 129, 134, 140, 12…
## $ LDL <dbl> 82, 129, 101, 93, 97, 108, 89, 96, 120, 135, 92, 131, 120…
## $ time <dbl> 76, 82, 168, 105, 193, 143, 63, 78, 61, 88, 90, 105, 120,…
## $ log_antibody <dbl> 10.647154, 9.889049, 10.900712, 9.906258, 9.563081, 8.837…
glimpse(dat2)
## Rows: 1,000
## Columns: 14
## $ id <dbl> 5001, 5002, 5003, 5004, 5005, 5006, 5007, 5008, 5009, 501…
## $ age <dbl> 58, 62, 71, 59, 69, 56, 65, 61, 62, 68, 57, 56, 64, 60, 5…
## $ gender <dbl> 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, …
## $ race <dbl> 4, 1, 4, 1, 1, 1, 1, 1, 1, 4, 1, 1, 1, 1, 1, 1, 4, 3, 1, …
## $ smoking <dbl> 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, …
## $ height <dbl> 176.4, 167.5, 179.3, 170.0, 166.5, 167.6, 175.9, 172.1, 1…
## $ weight <dbl> 86.4, 82.4, 79.2, 81.0, 74.8, 74.8, 69.2, 81.3, 82.1, 74.…
## $ bmi <dbl> 27.7, 29.4, 24.6, 28.0, 27.0, 26.6, 22.4, 27.4, 30.7, 26.…
## $ diabetes <dbl> 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, …
## $ hypertension <dbl> 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, …
## $ SBP <dbl> 130, 123, 145, 123, 150, 121, 132, 120, 142, 137, 127, 13…
## $ LDL <dbl> 115, 118, 149, 119, 142, 112, 127, 76, 86, 123, 140, 112,…
## $ time <dbl> 205, 229, 206, 163, 240, 206, 285, 185, 124, 127, 98, 199…
## $ log_antibody <dbl> 9.810890, 9.076660, 10.432296, 9.831918, 9.074990, 10.182…
Next, we will explore dat1 and perform Exploratory analysis
dat1 %>%
select(where(is.numeric)) %>%
summary()
## id age gender race
## Min. : 1 Min. :44.00 Min. :0.0000 Min. :1.000
## 1st Qu.:1251 1st Qu.:57.00 1st Qu.:0.0000 1st Qu.:1.000
## Median :2500 Median :60.00 Median :0.0000 Median :1.000
## Mean :2500 Mean :59.97 Mean :0.4854 Mean :1.749
## 3rd Qu.:3750 3rd Qu.:63.00 3rd Qu.:1.0000 3rd Qu.:3.000
## Max. :5000 Max. :75.00 Max. :1.0000 Max. :4.000
## smoking height weight bmi
## Min. :0.0000 Min. :150.2 Min. : 56.70 Min. :18.20
## 1st Qu.:0.0000 1st Qu.:166.1 1st Qu.: 75.40 1st Qu.:25.80
## Median :0.0000 Median :170.1 Median : 80.10 Median :27.60
## Mean :0.4952 Mean :170.1 Mean : 80.11 Mean :27.74
## 3rd Qu.:1.0000 3rd Qu.:174.2 3rd Qu.: 84.90 3rd Qu.:29.50
## Max. :2.0000 Max. :192.9 Max. :106.00 Max. :38.80
## diabetes hypertension SBP LDL
## Min. :0.0000 Min. :0.0000 Min. :101.0 Min. : 43.0
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:124.0 1st Qu.: 96.0
## Median :0.0000 Median :0.0000 Median :130.0 Median :110.0
## Mean :0.1544 Mean :0.4596 Mean :129.9 Mean :109.9
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:135.0 3rd Qu.:124.0
## Max. :1.0000 Max. :1.0000 Max. :155.0 Max. :185.0
## time log_antibody
## Min. : 30.0 Min. : 7.765
## 1st Qu.: 76.0 1st Qu.: 9.682
## Median :106.0 Median :10.089
## Mean :108.9 Mean :10.064
## 3rd Qu.:138.0 3rd Qu.:10.478
## Max. :270.0 Max. :11.961
# Count and proportion tables for categorical variables
dat1 %>%
select(gender, race, smoking, diabetes, hypertension) %>%
map(~table(.))
## $gender
## .
## 0 1
## 2573 2427
##
## $race
## .
## 1 2 3 4
## 3221 278 1036 465
##
## $smoking
## .
## 0 1 2
## 3010 1504 486
##
## $diabetes
## .
## 0 1
## 4228 772
##
## $hypertension
## .
## 0 1
## 2702 2298
dat1 %>%
select(gender, race, smoking, diabetes, hypertension) %>%
map(~prop.table(table(.)))
## $gender
## .
## 0 1
## 0.5146 0.4854
##
## $race
## .
## 1 2 3 4
## 0.6442 0.0556 0.2072 0.0930
##
## $smoking
## .
## 0 1 2
## 0.6020 0.3008 0.0972
##
## $diabetes
## .
## 0 1
## 0.8456 0.1544
##
## $hypertension
## .
## 0 1
## 0.5404 0.4596
Exploratory Visualizations:
ggplot(dat1, aes(x = log_antibody)) +
geom_histogram(bins = 30, fill = "skyblue", color = "white") +
labs(title = "Distribution of Antibody Levels", x = "log_antibody", y = "Count") +
theme_minimal()
plot_ly(
data = dat1,
x = ~time,
y = ~log_antibody,
type = "scatter",
mode = "markers", marker = list(opacity = 0.3)
) %>%
add_lines(
x = ~time,
y = ~fitted(loess(log_antibody ~ time, data = dat1)),
line = list(color = "red"),
name = "LOESS Smooth"
) %>%
layout(
title = "Antibody Levels Over Time",
xaxis = list(title = "Time Since Vaccination (days)"),
yaxis = list(title = "log_antibody")
)
Antibody levels increase initially, peak around day 75, then gradually decline.
Clear nonlinear relationship between log_antibody and time
Gender
p_gender = ggplot(dat1, aes(x = factor(gender), y = log_antibody)) +
geom_boxplot(fill = "lightblue") +
labs(title = "Antibody Levels by Gender", x = "Gender (0 = Female, 1 = Male)", y = "log_antibody") +
theme_minimal()
p_gender
Smoking status
p_smoking = ggplot(dat1, aes(x = factor(smoking), y = log_antibody)) +
geom_boxplot(fill = "plum") +
labs(title = "Antibody Levels by Smoking Status", x = "Smoking (0=Never, 1=Former, 2=Current)", y = "log_antibody") +
theme_minimal()
p_smoking
* Never and former smokers have similar distributions, with slightly
higher medians. * Current smokers tend to have slightly lower antibody
levels. * Smoking status may have a modest effect on immune
response.
Diabetes
p_diabetes = ggplot(dat1, aes(x = factor(diabetes), y = log_antibody)) +
geom_boxplot(fill = "salmon") +
labs(title = "Antibody Levels by Diabetes Status", x = "Diabetes (0 = No, 1 = Yes)", y = "log_antibody") +
theme_minimal()
p_diabetes
Race
p_race = ggplot(dat1, aes(x = factor(race), y = log_antibody)) +
geom_boxplot(fill = "khaki") +
labs(
title = "Antibody Levels by Race",
x = "Race (1 = White, 2 = Asian, 3 = Black, 4 = Hispanic)",
y = "log_antibody"
) +
theme_minimal()
p_race
* No substantial differences across race groups. * Medians are nearly
identical. * Interpretation: * Race may not be a strong predictor of
antibody levels here.
Hypertension status
p_htn = ggplot(dat1, aes(x = factor(hypertension), y = log_antibody)) +
geom_boxplot(fill = "lightgreen") +
labs(
title = "Antibody Levels by Hypertension Status",
x = "Hypertension (0 = No, 1 = Yes)",
y = "log_antibody"
) +
theme_minimal()
p_htn
We will creat a panel including all our categrical predictors analysis:
(p_gender | p_race | p_smoking) /
(p_diabetes | p_htn)
BMI
p_bmi = ggplot(dat1, aes(x = bmi, y = log_antibody)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "loess", se = FALSE, color = "darkgreen") +
labs(title = "Antibody levels by BMI", x = "BMI", y = "log_antibody") +
theme_minimal()
p_bmi
* Antibody levels are relatively flat until ~25–28 BMI, then slightly
decline. * Weak, nonlinear negative association at higher BMI.
Antibody vs. Age
p_age = ggplot(dat1, aes(x = age, y = log_antibody)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "loess", se = FALSE, color = "blue") +
labs(title = "Antibody Levels by Age", x = "Age (years)", y = "log_antibody") +
theme_minimal()
p_age
* Slight decline in antibody levels as age increases. * Smooth trend
suggests a gradual, nonlinear effect.
Antibody vs. Systolic Blood Pressure (SBP)
p_sbp = ggplot(dat1, aes(x = SBP, y = log_antibody)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "loess", se = FALSE, color = "darkred") +
labs(title = "Antibody Levels by Systolic Blood Pressure", x = "SBP (mmHg)", y = "log_antibody") +
theme_minimal()
p_sbp
* No clear trend — curve is almost flat across the SBP range. * SBP
appears to have minimal or no relationship with antibody levels.
Antibody vs. LDL Cholesterol
p_ldl = ggplot(dat1, aes(x = LDL, y = log_antibody)) +
geom_point(alpha = 0.3) +
geom_smooth(method = "loess", se = FALSE, color = "purple") +
labs(title = "Antibody Levels by LDL Cholesterol", x = "LDL (mg/dL)", y = "log_antibody") +
theme_minimal()
p_ldl
(p_bmi | p_age) /
(p_sbp | p_ldl)
Next step, we choose the GAM model training for option 1
mod_gam_full = gam(
log_antibody ~ s(time) + s(age) + s(bmi) +
LDL + SBP +
gender + race + smoking + diabetes + hypertension,
data = dat1
)
summary(mod_gam_full)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## log_antibody ~ s(time) + s(age) + s(bmi) + LDL + SBP + gender +
## race + smoking + diabetes + hypertension
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.1728744 0.2041632 49.827 < 2e-16 ***
## LDL -0.0002465 0.0003873 -0.636 0.525
## SBP 0.0008631 0.0016387 0.527 0.598
## gender -0.2957105 0.0149976 -19.717 < 2e-16 ***
## race -0.0097455 0.0069589 -1.400 0.161
## smoking -0.0561992 0.0112546 -4.993 6.13e-07 ***
## diabetes 0.0154195 0.0207306 0.744 0.457
## hypertension -0.0163727 0.0250865 -0.653 0.514
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(time) 7.862 8.561 46.75 <2e-16 ***
## s(age) 1.000 1.000 114.18 <2e-16 ***
## s(bmi) 5.127 6.197 68.43 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.214 Deviance explained = 21.7%
## GCV = 0.2809 Scale est. = 0.27966 n = 5000
plot(mod_gam_full, pages = 1)
All three smooth terms are highly significant (p < 2e-16)
Time, age, and BMI have nonlinear relationships with log_antibody
Their estimated degrees of freedom (edf):
s(time) = 7.86 → fairly wiggly
s(age) = 1.00 → almost linear
s(bmi) = 5.13 → moderately nonlinear
Interpretation:Smooth terms for time, age, and BMI were statistically significant, indicating non-linear effects on antibody levels. The model automatically adapted flexibility for each predictor based on the data.
Model performance of the traning set
# Predict log_antibody in the test set
pred_gam = predict(mod_gam_full, newdata = dat2)
# True observed values
true_values = dat2$log_antibody
# Create a tibble with true and predicted values
eval_df = tibble(
truth = dat2$log_antibody,
prediction = pred_gam
)
# Get multiple metrics at once
metrics(eval_df, truth = truth, estimate = prediction)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 0.539
## 2 rsq standard 0.171
## 3 mae standard 0.433
Using the GAM model on this external dataset, the model achieved a root mean squared error (RMSE) of 0.539 and a mean absolute error (MAE) of 0.433, indicating modest prediction error. The model explained approximately 17.1% of the variation in log-transformed antibody levels (R² = 0.171). While performance was not exceptionally high, this is consistent with the inherent complexity and noise in biological data. The relatively stable performance across the two datasets suggests that the model generalizes reasonably well to new individuals.